if (!require("pacman")) install.packages("pacman")
## Indlæser krævet pakke: pacman
pacman::p_load("edgeR")
pacman::p_load("readr")
pacman::p_load("readxl")
pacman::p_load("biomaRt")
pacman::p_load("magrittr")
pacman::p_load("tibble")
pacman::p_load("stringr")
pacman::p_load("ggplot2")
pacman::p_load("data.table")
pacman::p_load("ggplot2", "patchwork")
pacman::p_load("openxlsx")
pacman::p_load("dplyr")
pacman::p_load("data.table")
pacman::p_load("ggvenn")
pacman::p_load("pheatmap")
pacman::p_load_gh("altintasali/aamisc")
library(ggrepel)
#Optional: Install archived R packages if necessary.
# This section is commented out, as these dependencies might not be needed in the final analysis.
# pacman::p_load("qvalue", "rain", "limma", "devtools")
# url <- "https://cran.r-project.org/src/contrib/Archive/HarmonicRegression/HarmonicRegression_1.0.tar.gz"
# pkgFile <- "HarmonicRegression_1.0.tar.gz"
# download.file(url = url, destfile = pkgFile)
# install.packages(pkgs=pkgFile, type="source", repos=NULL)
# file.remove(pkgFile)
# Define color palette for visualizations
# These colors represent conditions like AF, Metformin, and Sham.
publication_colors <- c("AF" = "#285291", "Metformin" = "#7C1516", "Sham" = "#9B9B9B")
# Load raw RNA-seq count data from the specified file path.
count_file <- "../../../../RNA-seq/data/count/gene-expression-all-reverse-stranded-countReadPairs.tsv"
count <- readr::read_delim(count_file)
## Rows: 38849 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Geneid, Chr, Start, End, Strand
## dbl (61): Length, Dorado_LA, Dorado_RA, Im_A_Mets_Fan_LA, Im_A_Mets_Fan_RA, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove specific horses from the analysis (natural AF cases).
# only horses relevant to the experimental design are included.
count <- as_tibble(count)
count <- count %>%
dplyr::select(-c(Dorado_LA, Dorado_RA, Im_A_Mets_Fan_LA, Im_A_Mets_Fan_RA,
Jytte_LA, Jytte_RA, Kevin_Cook_LA, Kevin_Cook_RA,
San_Diego_LA, San_Diego_RA, Styles_LA, Styles_RA))
# Load metadata associated with the samples from an Excel file.
meta_file <- "../../../../RNA-seq/data/metadata/meta.xlsx"
meta <- readxl::read_excel(meta_file)
# Load and process gene annotation data.
# `geneinfo_file` contains information about gene identifiers, descriptions, and positions.
geneinfo_file <- "../../../../RNA-seq/data/gene_annotation/horse_gene_annotation.tsv.gz"
geneinfo <- fread(geneinfo_file)
# Set column names for the gene annotation data and remove unnecessary columns.
colnames(geneinfo)
## [1] "Gene stable ID"
## [2] "Gene stable ID version"
## [3] "Gene description"
## [4] "Chromosome/scaffold name"
## [5] "Gene start (bp)"
## [6] "Gene end (bp)"
## [7] "Strand"
## [8] "Gene name"
## [9] "NCBI gene (formerly Entrezgene) ID"
## [10] "NCBI gene (formerly Entrezgene) description"
setnames(geneinfo, new = c("ENSEMBL", "ENSEMBLv", "Description_detailed",
"Chr", "Start", "End", "Strand", "GENENAME", "ENTREZID", "Description"))
geneinfo <- geneinfo[, c("ENSEMBLv", "Description_detailed") := NULL]
geneinfo <- geneinfo[!duplicated(ENSEMBL), ]
# Merge count data with gene annotation information.
# This ensures that the count matrix has relevant gene annotations for downstream analysis.
annot <- merge(x = count[,c("Geneid", "Length")],
y = geneinfo,
by.x = "Geneid",
by.y = "ENSEMBL",
all.x = TRUE,
all.y = FALSE)
setnames(annot, old = "Geneid", new = "ENSEMBL")
annot <- data.frame(annot)
rownames(annot) <- annot$ENSEMBL
# Write the cleaned and merged annotation file for future reference.
fwrite(x = annot,
file = "../../../../RNA-seq/data/gene_annotation/horse_gene_annotation_filtered.tsv.gz",
sep = "\t")
# Clean up the metadata file and ensure correct row names.
meta <- meta %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
head(meta)
## Horse Region Group Condition
## M1_LA M1 LA Metformin LA_Metformin
## M1_RA M1 RA Metformin RA_Metformin
## M10_LA M10 LA AF LA_AF
## M10_RA M10 RA AF RA_AF
## M11_LA M11 LA Metformin LA_Metformin
## M11_RA M11 RA Metformin RA_Metformin
# Clean count matrix to remove unnecessary columns and set row names.
count <- count[, -c(2:6)] # Remove columns not relevant for differential expression analysis
count <- count %>% remove_rownames %>% column_to_rownames(var="Geneid")
head(count)
## M10_LA M10_RA M11_LA M11_RA M12_LA M12_RA M13_LA M13_RA
## ENSECAG00000013298 115 171 173 180 198 196 130 133
## ENSECAG00000019402 0 0 0 0 0 0 0 0
## ENSECAG00000052423 788 1131 1085 796 914 1091 1323 912
## ENSECAG00000011978 8 7 2 11 16 9 6 14
## ENSECAG00000005166 174 358 184 343 204 309 323 252
## ENSECAG00000032916 406 567 434 416 403 594 548 426
## M14_LA M14_RA M15_LA M15_RA M16_LA M16_RA M17_LA M17_RA
## ENSECAG00000013298 195 184 147 164 192 186 171 159
## ENSECAG00000019402 0 0 0 0 0 0 0 0
## ENSECAG00000052423 817 949 966 1328 1009 1193 1310 932
## ENSECAG00000011978 10 13 4 15 7 13 5 10
## ENSECAG00000005166 278 193 256 237 259 314 303 256
## ENSECAG00000032916 355 520 362 733 435 717 485 433
## M18_LA M18_RA M19_LA M19_RA M1_LA M1_RA M20_LA M20_RA M22_LA
## ENSECAG00000013298 122 188 148 155 179 197 304 172 131
## ENSECAG00000019402 0 0 0 0 0 0 0 0 0
## ENSECAG00000052423 1037 1185 1187 838 827 687 1900 1466 843
## ENSECAG00000011978 7 17 6 11 13 10 4 9 7
## ENSECAG00000005166 305 279 304 255 266 264 410 189 252
## ENSECAG00000032916 450 455 448 378 431 314 999 1021 431
## M22_RA M23_LA M23_RA M24_LA M24_RA M25_LA M25_RA M2_LA M2_RA
## ENSECAG00000013298 175 185 173 201 182 183 171 161 156
## ENSECAG00000019402 0 0 0 0 0 0 0 0 0
## ENSECAG00000052423 957 933 921 1012 826 1152 953 959 1324
## ENSECAG00000011978 11 8 6 4 4 5 9 8 12
## ENSECAG00000005166 250 258 263 250 247 276 244 264 247
## ENSECAG00000032916 569 420 509 427 447 451 555 542 694
## M3_LA M3_RA M4_LA M4_RA M5_LA M5_RA M6_LA M6_RA M7_LA M7_RA
## ENSECAG00000013298 133 198 134 201 131 182 134 171 130 162
## ENSECAG00000019402 0 0 0 0 0 0 0 0 0 0
## ENSECAG00000052423 670 818 726 980 908 761 839 864 926 1496
## ENSECAG00000011978 6 13 4 6 8 5 6 5 10 9
## ENSECAG00000005166 161 269 198 281 175 240 300 165 287 243
## ENSECAG00000032916 324 402 280 468 527 460 470 494 427 942
## M8_LA M8_RA M9_LA M9_RA
## ENSECAG00000013298 200 188 157 210
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 833 788 856 1004
## ENSECAG00000011978 12 20 13 18
## ENSECAG00000005166 246 213 182 241
## ENSECAG00000032916 460 526 343 426
edgeR
differential expression analysis for multilevel designs# Remove version numbers from gene names if present (e.g., "Gene.1" -> "Gene").
rownames(count) <- stringr::str_split_fixed(string = rownames(count),
pattern = '[.]',
n = 2)[,1]
# Reorder metadata and annotation tables to match the column order of the count matrix.
column_order <- names(count)
meta_reordered <- meta[column_order, , drop = FALSE]
annot_order <- rownames(count)
annot_reordered <- annot[annot_order, ]
# Create a DGEList object for `edgeR` analysis.
# The DGEList object contains the count matrix, gene annotation, and sample metadata.
d <- DGEList(counts = count, genes = annot_reordered, samples = meta_reordered)
# Filter out genes with low expression across all samples.
keep <- filterByExpr(d)
table(keep) # Display the number of genes retained after filtering.
## keep
## FALSE TRUE
## 26832 12017
# Subset the DGEList object to keep only the filtered genes.
y <- d[keep, , keep.lib.sizes=FALSE]
# Calculate normalization factors to adjust for differences in library sizes between samples.
y <- calcNormFactors(y)
# Write normalized counts to a file for multi-omics integration.
norm_counts <- cpm(y, normalized.lib.sizes=TRUE)
# Uncomment to save normalized counts for further analysis.
# write.table(norm_counts, file="../../../../RNA-seq/analysis/01_dge/output/rna_seq_data.txt", sep="\t", col.names=NA, quote=FALSE)
# Normalized counts with gene names included for easier multi-omics integration in OmicsAnalyst.
norm_counts_genenames <- as.data.frame(norm_counts)
norm_counts_genenames$ENSEMBL <- rownames(norm_counts_genenames)
norm_counts_genenames <- merge(norm_counts_genenames, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL")
# Use gene names when available, otherwise fallback to ENSEMBL IDs.
norm_counts_genenames$ID <- ifelse(is.na(norm_counts_genenames$GENENAME) | norm_counts_genenames$GENENAME == "",
norm_counts_genenames$ENSEMBL,
norm_counts_genenames$GENENAME)
# Ensure unique identifiers by appending ENSEMBL ID for duplicate gene names.
duplicates <- norm_counts_genenames$ID[duplicated(norm_counts_genenames$ID)]
norm_counts_genenames$ID <- ifelse(norm_counts_genenames$ID %in% duplicates,
paste(norm_counts_genenames$ID, norm_counts_genenames$ENSEMBL, sep = "_"),
norm_counts_genenames$ID)
rownames(norm_counts_genenames) <- norm_counts_genenames$ID
# Remove unnecessary columns for the final output.
norm_counts_genenames <- norm_counts_genenames[, !colnames(norm_counts_genenames) %in% c("ENSEMBL", "GENENAME", "ID")]
# Uncomment to save the final normalized counts with gene names.
# write.table(norm_counts_genenames, file="../../../../RNA-seq/analysis/01_dge/output/rna_seq_data_genenames.txt", sep="\t", col.names=NA, quote=FALSE)
# Create design matrix for linear modeling in edgeR
# Condition refers to different experimental conditions (e.g., Treatment, Control)
design <- model.matrix(~0 + Condition , y$samples)
colnames(design) <- gsub("Condition", "", colnames(design))
# Estimate dispersion values to model the variance of expression.
y <- estimateDisp(y, design)
# Calculate normalized expression levels (CPM - Counts Per Million)
CPM <- cpm(y)
logCPM <- cpm(y, log = TRUE)
# Transform the logCPM matrix into long format for ggplot2 visualization.
# This will allow each gene/sample pair to be plotted individually.
logCPM_melted <- data.table::melt(logCPM)
## Warning in data.table::melt(logCPM): The melt generic in data.table has been
## passed a matrix and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now deprecated
## as well. To continue using melt methods from reshape2 while both libraries are
## attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(logCPM). In the next version, this warning will become an error.
# Rename columns for better readability and consistency.
# "Var1" and "Var2" refer to the dimensions of the matrix and are renamed to "Gene" and "Sample".
setnames(x = logCPM_melted,
old = c("Var1", "Var2", "value"),
new = c("Gene", "Sample", "logCPM"))
# Add experimental conditions like "Group" or "Region" to each sample for coloring in the plots.
logCPM_melted <- data.table::merge.data.table(x = logCPM_melted,
y = data.table(Sample = rownames(meta), meta),
by = "Sample")
# Color the boxes based on the experimental condition to highlight potential differences.
ggplot(logCPM_melted, aes(x = Sample, y = logCPM)) +
geom_boxplot(aes(color = Condition)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
ggtitle("logCPM boxplots")
# Create density plots of logCPM values for each sample.
# This visualization helps to assess the distribution of expression values and identify batch effects.
ggplot(logCPM_melted, aes(x = logCPM)) +
geom_density(aes(group = Sample, color = Condition)) +
theme_bw() +
ggtitle("logCPM density distributions")
# Create a Multi-dimensional Scaling (MDS) plot for the RNA-seq data.
# This plot visualizes the similarity between samples based on expression values,
# with similar samples clustering together.
mds <- plotMDS(y, plot = FALSE)
dims <- list(p1 = c(1,2), p2 = c(1,3), p3 = c(2,3))
mds_plot <- list()
# without labels
for (i in seq_along(dims)){
mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
meta = meta,
dim = dims[[i]],
color.by = "Group",
shape.by = "Region",
legend.position = "right"
# text.by = "Horse",
# text.size = 1.5
)
}
patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# Create MDS plots with sample labels for detailed view
for (i in seq_along(dims)){
mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
meta = meta,
dim = dims[[i]],
color.by = "Group",
shape.by = "Region",
legend.position = "right",
text.by = "Horse",
text.size = 1.5
)
}
patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# Remove batch effects using the "removeBatchEffect" function.
# The "batch" parameter specifies the factor to remove (Horse ID in this case).
# This helps to isolate the effects of the experimental conditions by eliminating variability introduced by horses contributing with two samples each (LA and RA).
logCPM_batchRemoved <- removeBatchEffect(logCPM,
batch = as.factor(y$samples$Horse),
design = design)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 12017 probe(s)
mds <- plotMDS(logCPM_batchRemoved, plot = FALSE)
dims <- list(p1 = c(1,2), p2 = c(1,3), p3 = c(2,3))
mds_plot <- list()
meta$Group <- factor(meta$Group, levels = names(publication_colors))
# without labels
for (i in seq_along(dims)){
mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
meta = meta,
dim = dims[[i]],
color.by = "Group",
shape.by = "Region",
legend.position = "right"
# text.by = "Horse",
# text.size = 1.5
) +
scale_color_manual(values = publication_colors)
}
patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# with labels
for (i in seq_along(dims)){
mds_plot[[i]] <- aamisc::ggMDS(mds = mds,
meta = meta,
dim = dims[[i]],
color.by = "Group",
shape.by = "Region",
legend.position = "right",
text.by = "Horse",
text.size = 1.5
) +
scale_color_manual(values = publication_colors)
}
mds_plot[1]
## [[1]]
patchwork::wrap_plots(mds_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# Define contrasts to test specific hypotheses using the design matrix.
colnames(design) # Display the names of the coefficients in the design matrix to verify the structure.
## [1] "LA_AF" "LA_Metformin" "LA_Sham" "RA_AF" "RA_Metformin"
## [6] "RA_Sham"
# Create a list of contrasts based on experimental design.
con <- makeContrasts(
# Comparisons of metformin treatment vs. placebo in both left (LA) and right atria (RA).
met_vs_placebo_RA = RA_Metformin - RA_AF, # Metformin vs. Placebo in RA
met_vs_placebo_LA = LA_Metformin - LA_AF, # Metformin vs. Placebo in LA
# Comparison of regional differences (RA vs. LA) under placebo and metformin treatment.
RA_vs_LA_placebo = RA_AF - LA_AF, # RA vs. LA under Placebo
RA_vs_LA_met = RA_Metformin - LA_Metformin, # RA vs. LA under Metformin
# Average treatment effect across both atria.
AverageTreatmentEffect = (LA_Metformin + RA_Metformin)/2 - (LA_AF + RA_AF)/2,
# Interaction term: testing if the treatment effect differs between RA and LA.
met.vs.placebo_vs_RA.LA = (RA_Metformin - RA_AF) - (LA_Metformin - LA_AF),
# Comparisons of AF vs. Sham (control) in both left (LA) and right atria (RA).
AF_vs_sham_RA = RA_AF - RA_Sham, # AF vs. Sham in RA
AF_vs_sham_LA = LA_AF - LA_Sham, # AF vs. Sham in LA
# Comparison of regional differences (RA vs. LA) under Sham and AF conditions.
RA_vs_LA_sham = RA_Sham - LA_Sham, # RA vs. LA under Sham
RA_vs_LA_AF = RA_AF - LA_AF, # RA vs. LA under AF
# Average disease effect across both atria.
AverageDiseaseEffect = (LA_AF + RA_AF)/2 - (LA_Sham + RA_Sham)/2,
# Interaction term: testing if the disease effect differs between RA and LA.
placebo.vs.sham_vs_RA.LA = (RA_AF - RA_Sham) - (LA_AF - LA_Sham),
levels = design # Specify the levels of the design matrix.
)
con # Output the contrast matrix to verify that contrasts are correctly specified.
## Contrasts
## Levels met_vs_placebo_RA met_vs_placebo_LA RA_vs_LA_placebo
## LA_AF 0 -1 -1
## LA_Metformin 0 1 0
## LA_Sham 0 0 0
## RA_AF -1 0 1
## RA_Metformin 1 0 0
## RA_Sham 0 0 0
## Contrasts
## Levels RA_vs_LA_met AverageTreatmentEffect met.vs.placebo_vs_RA.LA
## LA_AF 0 -0.5 1
## LA_Metformin -1 0.5 -1
## LA_Sham 0 0.0 0
## RA_AF 0 -0.5 -1
## RA_Metformin 1 0.5 1
## RA_Sham 0 0.0 0
## Contrasts
## Levels AF_vs_sham_RA AF_vs_sham_LA RA_vs_LA_sham RA_vs_LA_AF
## LA_AF 0 1 0 -1
## LA_Metformin 0 0 0 0
## LA_Sham 0 -1 -1 0
## RA_AF 1 0 0 1
## RA_Metformin 0 0 0 0
## RA_Sham -1 0 1 0
## Contrasts
## Levels AverageDiseaseEffect placebo.vs.sham_vs_RA.LA
## LA_AF 0.5 -1
## LA_Metformin 0.0 0
## LA_Sham -0.5 1
## RA_AF 0.5 1
## RA_Metformin 0.0 0
## RA_Sham -0.5 -1
# Explanation: The "AverageDiseaseEffect" is not the same as the main disease effect, but we will leave that as we are only interested in the region-wise comparisons.
# Perform differential expression analysis using `voomLmFit` function.
# This approach is suitable for RNA-seq data with complex experimental designs, such as multilevel/paired design models.
# The `block` argument specifies a blocking factor (Horse ID)
y_raw <- d[keep, ,keep.lib.sizes=FALSE] # keep only filtered genes.
# Fit a linear model using the `voomLmFit` function with voom transformation.
# `plot = TRUE` will display the mean-variance trend, a diagnostic plot for voom transformation.
v <- voomLmFit(counts = y_raw,
design = design,
block = as.factor(y_raw$samples$Horse), # Block by Horse ID to account for repeated measures/paired design (two atrial samples per horse).
sample.weights = TRUE,
plot = TRUE)
## First sample weights (min/max) 0.2037813/1.8751727
## First intra-block correlation 0.2684471
## Final sample weights (min/max) 0.2035692/1.8656543
## Final intra-block correlation 0.2680694
# Initialize an empty list to store differential expression results for each contrast.
res <- list()
# Loop through each contrast and perform differential analysis.
for (i in colnames(con)) {
# Fit the contrast matrix to the linear model.
fit <- contrasts.fit(v, contrasts = con)
# Apply empirical Bayes moderation to the standard errors to improve statistical power.
fit <- eBayes(fit, robust = TRUE)
# Extract the top differentially expressed genes for each contrast.
res[[i]] <- topTable(fit, coef = i, number = Inf)
# Add the contrast name as a new column in the result table for easier identification.
res[[i]] <- data.frame(res[[i]], Contrast = i)
# Print the number of differentially expressed (DE) genes with adjusted p-value < 0.05.
n <- res[[i]] %>% dplyr::filter(adj.P.Val < 0.05) %>% nrow
print(paste('Number of DE genes for', i, '=', n))
}
## [1] "Number of DE genes for met_vs_placebo_RA = 4428"
## [1] "Number of DE genes for met_vs_placebo_LA = 0"
## [1] "Number of DE genes for RA_vs_LA_placebo = 5174"
## [1] "Number of DE genes for RA_vs_LA_met = 1684"
## [1] "Number of DE genes for AverageTreatmentEffect = 878"
## [1] "Number of DE genes for met.vs.placebo_vs_RA.LA = 4043"
## [1] "Number of DE genes for AF_vs_sham_RA = 1950"
## [1] "Number of DE genes for AF_vs_sham_LA = 643"
## [1] "Number of DE genes for RA_vs_LA_sham = 761"
## [1] "Number of DE genes for RA_vs_LA_AF = 5174"
## [1] "Number of DE genes for AverageDiseaseEffect = 1975"
## [1] "Number of DE genes for placebo.vs.sham_vs_RA.LA = 19"
# Combine all contrast results into a single data frame for easier output and visualization.
res_all <- do.call(rbind, res)
# Create an output Excel file with results for each contrast.
# Uncomment to save
# openxlsx::write.xlsx(x = res, file = "../../../../RNA-seq/analysis/01_dge/output/dge_results.xlsx", asTable = TRUE)
# Create an output TSV file with the combined results for all contrasts.
# Uncomment to save
#data.table::fwrite(x = res_all, file = "../../../../RNA-seq/analysis/01_dge/output/dge_results.tsv.gz", sep = "\t")
# Create p-value histograms for each contrast.
# This visualization helps assess the distribution of p-values and identify potential issues
ggplot(res_all, aes(x = P.Value)) +
geom_histogram(fill = "lightgray",
color = "black",
breaks = seq(0, 1, by = 0.05),
closed = "right",
lwd = 0.2) +
facet_wrap(~ Contrast, nrow = 3, scales = "free") +
theme_bw()
volcano_plots <- list()
for (i in names(res)){
volcano_plots[[i]] <- ggVolcano(x = res[[i]],
fdr = 0.05,
fdr.column = "adj.P.Val",
pvalue.column = "P.Value",
logFC = 0,
logFC.column = "logFC",
text.size = 2) +
theme_bw(base_size = 10) +
ggtitle(i)
}
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
# Combine all volcano plots into a single layout with 3 columns.
patchwork::wrap_plots(volcano_plots, ncol = 3)
# Print individual volcano plots for key contrasts.
print(volcano_plots[["AF_vs_sham_RA"]])
print(volcano_plots[["AF_vs_sham_LA"]])
print(volcano_plots[["met_vs_placebo_RA"]])
print(volcano_plots[["met_vs_placebo_LA"]])
# Create a combined plot of the key contrasts for easier comparison.
combined_plot <- ((volcano_plots[["AF_vs_sham_RA"]]) | (volcano_plots[["AF_vs_sham_LA"]])) / ((volcano_plots[["met_vs_placebo_RA"]]) | (volcano_plots[["met_vs_placebo_LA"]]))
print(combined_plot)
### Volcano Plot Custom
regions <- unique(meta$Region)
# Gene Name Mapping for Volcano Plots in RNA-seq Analysis
# Check if the annotation dataframe has the necessary columns
if (!"ENSEMBL" %in% colnames(annot_reordered) || !"GENENAME" %in% colnames(annot_reordered)) {
stop("Error: The annotation dataframe must contain both 'ENSEMBL' and 'GENENAME' columns.")
}
# Create a named vector for ENSEMBL to GENENAME mapping
ensembl_to_genename <- setNames(annot_reordered$GENENAME, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to GeneNames in the res list
for (contrast_name in names(res)) {
# Ensure the dataframe has ENSEMBL IDs as rownames
if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
}
# Map GENENAMEs to the dataframe using ENSEMBL IDs
res[[contrast_name]]$GENENAME <- ensembl_to_genename[res[[contrast_name]]$ENSEMBL]
# Replace NA values in GENENAME with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
res[[contrast_name]]$GENENAME[is.na(res[[contrast_name]]$GENENAME)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$GENENAME)]
}
# Source the helper function
source("volcano_helpers.R")
# Create lists to store both versions of volcano plots
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()
# Iterate over each contrast in `res` and create custom volcano plots with/without labels
for (contrast_name in names(res)) {
# Ensure the GeneName column is present in the dataframe for labeling
if (!"GeneName" %in% colnames(res[[contrast_name]])) {
# Map ENSEMBL to GeneName using the preprocessed mapping vector
res[[contrast_name]]$GeneName <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
}
# Generate volcano plots with and without labels using the helper function
volcano_plots <- create_custom_volcano_plot(
df = res[[contrast_name]],
logFC_col = "logFC",
pvalue_col = "P.Value",
adj_pvalue_col = "adj.P.Val",
contrast_name = contrast_name,
fc_cutoff = 0,
pvalue_cutoff = 0.05,
save_plot = TRUE,
output_path = "../output/",
show_labels = TRUE # Always generate both labeled and unlabeled versions
)
# Store the plots in separate lists
volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
# Combine and display the volcano plots without labels
patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)
# Specify the output directory
output_dir <- "../output/volcano_plots"
# Ensure the directory exists
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
# List of contrasts to save
contrasts_to_save <- c("AF_vs_sham_RA", "AF_vs_sham_LA", "met_vs_placebo_RA", "met_vs_placebo_LA")
# Loop over each specified contrast and save both labeled and unlabeled plots
for (contrast_name in contrasts_to_save) {
# Print the plots to console (optional, useful for debugging)
print(volcano_plots_with_labels[[contrast_name]])
print(volcano_plots_no_labels[[contrast_name]])
# Save the plot with labels
ggsave(
filename = file.path(output_dir, paste0("volcano_plot_", contrast_name, "_With_Labels.png")),
plot = volcano_plots_with_labels[[contrast_name]],
width = 6,
height = 6,
dpi = 600
)
# Save the plot without labels
ggsave(
filename = file.path(output_dir, paste0("volcano_plot_", contrast_name, "_No_Labels.png")),
plot = volcano_plots_no_labels[[contrast_name]],
width = 6,
height = 6,
dpi = 600
)
}
## Warning: ggrepel: 1579 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1690 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 372 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 485 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 3781 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 3948 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Explorative Data Analysis ## Heatmap Generator
## Heatmap Generator using Non-Batch Corrected Values with Error Handling
generate_pheatmap_for_goid_non_batch <- function(go_file, annot, logCPM, res_all, meta, meta_reordered, goid, contrast1, contrast2, region1, region2) {
# Get GO annotation
go <- fread(go_file)
go <- go[, c("Gene stable ID", "GO term accession", "GO term name", "GO domain")]
setnames(go, new = c("ENSEMBL", "GOID", "Description", "GOdomain"))
go <- go[GOdomain != "", ]
# Filter for specified GO term
go_activity <- go[GOID == goid, "ENSEMBL"]
go_ensembl <- annot[annot$ENSEMBL %in% go_activity$ENSEMBL, c("GENENAME", "ENSEMBL")]
subset_logCPM <- logCPM[rownames(logCPM) %in% go_ensembl$ENSEMBL, ]
rownames(subset_logCPM) <- go_ensembl$GENENAME[match(rownames(subset_logCPM), go_ensembl$ENSEMBL)]
# Significant genes
sig_genes <- res_all[res_all$ENSEMBL %in% go_ensembl$ENSEMBL & res_all$adj.P.Val < 0.05, ]
sig_contrast1_genes <- sig_genes[sig_genes$Contrast == contrast1, "GENENAME"]
sig_contrast2_genes <- sig_genes[sig_genes$Contrast == contrast2, "GENENAME"]
# Check if there are any significant genes for contrast1 and region1
if (length(sig_contrast1_genes) > 0) {
# Generate heatmap for the first region using non-batch corrected values
region1_subset_logCPM <- subset_logCPM[, row.names(meta)[meta$Region == region1]]
region1_subset_logCPM <- region1_subset_logCPM[row.names(region1_subset_logCPM) %in% sig_contrast1_genes, ]
if (nrow(region1_subset_logCPM) > 0) {
pheatmap(region1_subset_logCPM,
annotation_col = meta_reordered[, "Group", drop=FALSE],
annotation_colors = list(Group = publication_colors),
scale = "row",
show_rownames = TRUE,
fontsize = 10,
fontsize_row = 8,
fontsize_col = 8,
color = colorRampPalette(viridisLite::magma(50))(50),
main = paste("Non-Batch Corrected Heatmap for", goid, "in", region1))
} else {
print(paste("No significant genes to display for contrast:", contrast1, "in region:", region1))
}
} else {
print(paste("No significant genes found for contrast:", contrast1, "in region:", region1))
}
# Check if there are any significant genes for contrast2 and region2
if (length(sig_contrast2_genes) > 0) {
# Generate heatmap for the second region using non-batch corrected values
region2_subset_logCPM <- subset_logCPM[, row.names(meta)[meta$Region == region2]]
region2_subset_logCPM <- region2_subset_logCPM[row.names(region2_subset_logCPM) %in% sig_contrast2_genes, ]
if (nrow(region2_subset_logCPM) > 0) {
pheatmap(region2_subset_logCPM,
annotation_col = meta_reordered[, "Group", drop=FALSE],
annotation_colors = list(Group = publication_colors),
scale = "row",
show_rownames = TRUE,
fontsize = 10,
fontsize_row = 8,
fontsize_col = 8,
color = colorRampPalette(viridisLite::viridis(50))(50),
main = paste("Non-Batch Corrected Heatmap for", goid, "in", region2))
} else {
print(paste("No significant genes to display for contrast:", contrast2, "in region:", region2))
}
} else {
print(paste("No significant genes found for contrast:", contrast2, "in region:", region2))
}
}
# Example usage with non-batch corrected logCPM values
generate_pheatmap_for_goid_non_batch(
go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
annot = annot,
logCPM = logCPM, # Using non-batch corrected logCPM
res_all = res_all,
meta = meta,
meta_reordered = meta_reordered,
goid = "GO:0004672", # Example GOID for protein kinase activity
contrast1 = "AF_vs_sham_RA", # Example Contrast 1
contrast2 = "AF_vs_sham_LA", # Example Contrast 2
region1 = "RA", # Example Region 1
region2 = "LA" # Example Region 2
)
# Ion-Channel Compex: GO:0034702 with non-batch corrected values
generate_pheatmap_for_goid_non_batch(
go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
annot = annot,
logCPM = logCPM, # Using non-batch corrected logCPM
res_all = res_all,
meta = meta,
meta_reordered = meta_reordered,
goid = "GO:0034702", # Ion-channel Complex
contrast1 = "met_vs_placebo_RA",
contrast2 = "met_vs_placebo_LA",
region1 = "RA",
region2 = "LA"
)
## [1] "No significant genes found for contrast: met_vs_placebo_LA in region: LA"
## Ionchannel Heatmap used in Main Figure
#Example for mitochondrial matrix GO:0005759
generate_pheatmap_for_goid_non_batch(
go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
annot = annot,
logCPM = logCPM, # Using non-batch corrected logCPM
res_all = res_all,
meta = meta,
meta_reordered = meta_reordered,
goid = "GO:0005759", # Mitochondrial Matrix
contrast1 = "AF_vs_sham_RA",
contrast2 = "AF_vs_sham_LA",
region1 = "RA",
region2 = "LA"
)
## Mitochondrial Matrix Heatmap used in Supplemnetary Figure
# Generate the heatmap and capture it as a grob
heatmap_RA <- generate_pheatmap_for_goid_non_batch(
go_file = "../../../data/gene_annotation/horse_GO.tsv.gz",
annot = annot,
logCPM = logCPM,
res_all = res_all,
meta = meta,
meta_reordered = meta_reordered,
goid = "GO:0034702",
contrast1 = "met_vs_placebo_RA",
contrast2 = "met_vs_placebo_LA",
region1 = "RA",
region2 = "LA"
)
## [1] "No significant genes found for contrast: met_vs_placebo_LA in region: LA"